{
 "metadata": {
  "name": "",
  "signature": "sha256:0572f58810bcde678e56e4c78ad5181092637198a0b58fe081990b0b1540ee52"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Labelling the class values for the twitter dataset.\n",
      "import os\n",
      "input_filename = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\", \"python_tweets.json\")\n",
      "classes_filename = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"twitter\", \"python_classes.json\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 125
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "tweets = []\n",
      "with open(input_filename) as inf:\n",
      "    for line in inf:\n",
      "        if len(line.strip()) == 0:\n",
      "            continue\n",
      "        tweets.append(json.loads(line)['text'])\n",
      "print(\"Loaded {} tweets\".format(len(tweets)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Loaded 200 tweets\n"
       ]
      }
     ],
     "prompt_number": 126
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with open(classes_filename) as inf:\n",
      "    labels = json.load(inf)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 127
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "n_samples = min(len(tweets), len(labels))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 128
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sample_tweets = [t.lower() for t in tweets[:n_samples]]\n",
      "labels = labels[:n_samples]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 129
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "y_true = np.array(labels)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 130
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"{:.1f}% have class 1\".format(np.mean(y_true == 1) * 100))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "52.0% have class 1\n"
       ]
      }
     ],
     "prompt_number": 131
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.base import TransformerMixin\n",
      "from nltk import word_tokenize\n",
      "\n",
      "class NLTKBOW(TransformerMixin):\n",
      "    def fit(self, X, y=None):\n",
      "        return self\n",
      "\n",
      "    def transform(self, X):\n",
      "        return [{word: True for word in word_tokenize(document)}\n",
      "                 for document in X]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 132
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.feature_extraction import DictVectorizer"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 134
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.naive_bayes import BernoulliNB"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 138
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.cross_validation import cross_val_score\n",
      "from sklearn.pipeline import Pipeline\n",
      "from sklearn.feature_extraction.text import CountVectorizer"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 140
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pipeline = Pipeline([('bag-of-words', NLTKBOW()),\n",
      "                     ('vectorizer', DictVectorizer()),\n",
      "                     ('naive-bayes', BernoulliNB())\n",
      "                     ])\n",
      "scores = cross_val_score(pipeline, sample_tweets, y_true, cv=10, scoring='f1')\n",
      "print(\"Score: {:.3f}\".format(np.mean(scores)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Score: 0.799\n"
       ]
      }
     ],
     "prompt_number": 147
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scores"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 148,
       "text": [
        "array([ 0.77777778,  0.86956522,  0.77777778,  0.8       ,  0.82352941,\n",
        "        0.82352941,  0.82352941,  0.58823529,  0.75      ,  0.95238095])"
       ]
      }
     ],
     "prompt_number": 148
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 142
    }
   ],
   "metadata": {}
  }
 ]
}